1 Pre-processing

1.1 Load packages

library(Signac)
library(Seurat)
library(ggplot2)
library(ggpubr)
library(tidyverse)
library(plyr)
library(reshape2)
library(data.table)
library(GenomicRanges)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(hdf5r)
library(stringr)
library(ggpubr)
library(RColorBrewer)
library(magick)
library(knitr) 
library(kableExtra)
library(scater)
set.seed(123)

1.2 Parameters

Here we set up the parameters of ATAC and RNA seq analysis.

# Thresholds
# ATAC-seq
TSS_enrichment <- 2 
nucleosome_signal_atac <- 2
min_lib_size_atac<- 300
max_lib_size_atac <- 200000


#RNA-seq
min_lib_size_rna<-470
max_lib_size_rna<-40000
min_ngenes_rna<-250
max_ngenes_rna<-7000
max_percent_mit <-20

PATH

path_to_save <- "results/R_objects/"

2 Functions

Here we will set the functions that will be used in the quality control (QC) process

2.1 Create sample and a fragment path list

#function that creates list of the sample path of the file h5
make.sample.path <- function(path){
  sample.path=list()
  
                sample.path <-list.files(path=path,
                      pattern="h5", recursive = T, full.names = TRUE) 
               
                return(sample.path)
        
}

make.fragpath <- function(path){
  sample.path<-list()
  
                sample.path <-list.files(path=path,
                      pattern=".tsv.gz$", recursive = T, full.names = TRUE) 
               
                return(sample.path)
        
}

2.2 Create a multiome Seurat Object list

In this function called make.SeuratObject.list we will create a list of multiome of seurat object with chromatin assay data. It has the 2 previous function nested.

make.SeuratObject.list <- function(path){
  sample.name <-list()
  sample.pathList <-list()
  tonsil_list <-list()
  fragpathList<-list()
  
  # get gene annotations for hg38
  sample.pathList<- make.sample.path(path)
  fragpathList<- make.fragpath(path)
  sample.name<- sub(".*Experiment/.*\\/(.*)\\/filtered_feature_bc_matrix.h5","\\1", sample.pathList, perl = TRUE )
  annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
  seqlevelsStyle(annotation) <- "UCSC"
  genome(annotation) <- "hg38"
                 
  for (i in seq_along(sample.pathList)){
    
                  counts = Read10X_h5(filename=sample.pathList[[i]])
                  fragpath <-fragpathList[[i]]
                  tonsil<-CreateSeuratObject(counts = counts$`Gene Expression`, assay = "RNA")
                 
                  tonsil[["ATAC"]] <- CreateChromatinAssay(
                    counts = counts$Peaks,
                    sep = c(":", "-"),
                    genome="hg38",
                    fragments = fragpath,
                    annotation = annotation
                    )
                
                  
                  tonsil_list[[sample.name[i]]] = assign(sample.name[[i]], tonsil, envir=.GlobalEnv)
                                        } 
  return(tonsil_list)
                        }

2.3 Filtering

Once we set all the quality control metric we will use this function to filter out all the cell that are not within the thresholds.

filtering.cell <- function(seurat_object.list){
  
  seurat_object <- subset(x = seurat_object.list,
              subset = nCount_ATAC < max_lib_size_atac &
              nCount_RNA < max_lib_size_rna &
              nCount_ATAC > min_lib_size_atac &
              nCount_RNA > min_lib_size_rna &
              nFeature_RNA > min_ngenes_rna &
              TSS.enrichment > TSS_enrichment &
              nucleosome_signal < nucleosome_signal_atac &
              percent.mt < max_percent_mit
              )
  
  return(seurat_object)
              
}

tecnical.cell <- function(seurat_object.list){
  
  seurat_object <- subset(x = seurat_object.list,
              
              nFeature_RNA < min_ngenes_rna 
          
              )
  
  return(seurat_object)
              
}

Function to save the filtered out cells. (it is not working)

filter.out.cell <- function(seurat_object.list){
  
  seurat_object <- subset(x = seurat_object.list,
              !(subset = nCount_ATAC < max_lib_size_atac &
              nCount_RNA < max_lib_size_rna &
              nCount_ATAC > min_lib_size_atac &
              nCount_RNA > min_lib_size_rna &
              nFeature_RNA > min_ngenes_rna &
              TSS.enrichment > TSS_enrichment &
                percent.mt < max_percent_mit &
              nucleosome_signal < nucleosome_signal_atac)
              )
  
  return(seurat_object)
              
}

2.4 Quality control function

This function will carry out the Nucleosome signal using NucleosomeSignalfunction, the TSS enrichement using TSSEnrichmentfunction and also it will calculate the fraction of mitocondria and ribosomal RNA in each sample. - Michochondrial genes are useful indicators of cell state. - We can define ribosomal proteins (their names begin with RPS or RPL), which often take substantial fraction of reads:

QC <- function(SeuratObject.list){

  
  DefaultAssay(SeuratObject.list) <- "ATAC"
  SeuratObject.list <- NucleosomeSignal(SeuratObject.list)
  SeuratObject.list <- TSSEnrichment(SeuratObject.list, fast = FALSE)
  SeuratObject.list$tss.level <- ifelse(SeuratObject.list$TSS.enrichment > 2, "High", "Low")
    
  DefaultAssay(SeuratObject.list)<-"RNA"
  SeuratObject.list[["percent.mt"]] <- PercentageFeatureSet(SeuratObject.list, pattern = "^MT-")
  SeuratObject.list[["percent_ribo"]] <- PercentageFeatureSet(SeuratObject.list, pattern = "^RP[SL]")

return(SeuratObject.list)

}

2.5 Create a data frame of each sample’s metadata

After the QC process, we can use this function to make a data frame with only the metadata of library.

md_df<- data.frame()
make.metadata.df<- function (all_data){
  
  md_df = rbind(md_df, all_data@meta.data)
  
  return(md_df)
}

2.6 TSS enrichment plot

tss.enrich.plot <- function(seurat_object_df) {
  
  for (i in seq_along(seurat_object_df)){
    
  DefaultAssay(seurat_object_df[[i]]) <- "ATAC"
  fh <- FragmentHistogram(object = seurat_object_df[[i]])
  
    
  p <- TSSPlot(seurat_object_df[[i]], group.by = 'tss.level') + 
    ggtitle(paste0("TSS enrichment score of ",unique(names(seurat_object_df[i])))) +
    theme_minimal() +
    geom_vline(xintercept =c(0,220),linetype = "dashed", colour = "black") + 
    xlab(bquote('Relative Position (bp form TSS')) +
  ylab(expression("Relative enrichement")) 
  
  print(p)
}
}

3 Load data and create Seurat Object

Signac uses information from three related input files (created using CellRanger ARC):

  1. Count matrix in h5 format
  2. ATAC Fragment file
  3. ATAC Fragment file index

3.1 Seurat Object

ReadExperiment 1

path_exp1<-"../data/Experiment/1"
SeuratObject.Exp1<-make.SeuratObject.list(path_exp1)

Read Experiment 2

path_exp2<-"../data/Experiment/2"
SeuratObject.Exp2<-make.SeuratObject.list(path_exp2)

list.name is a list of id sample with each library name.

list.name<-list(
pd9avu0k_kf9ft6kk="BCLL_14_T_1",
vuuqir4h_wfkyb5v8 ="BCLL_14_T_2",
admae8w2_89i88tvv="BCLL_15_T_1" ,
sr20954q_yiuuoxng="BCLL_15_T_2" ,
kmbfo1ab_ie02b4ny="BCLL_2_T_1" ,
ryh4el3i_biv0w7ca="BCLL_2_T_2" ,
bs2e7lr7_mdfwypvz="BCLL_2_T_3" ,
co7dzuup_xuczw9vc="BCLL_9_T_1" ,
qmzb59ew_t11l8dzm="BCLL_9_T_2" ,
ulx1v6sz_8a2nvf1c="BCLL_8_T_1" ,
wdp0p728_jf6w68km="BCLL_8_T_2")
list.name<-list.name[order(names(list.name))]

3.1.1 Joint Experiment 1 and 2 and change sample names by library name

SeuratObject.list<- c(SeuratObject.Exp1,SeuratObject.Exp2)
SeuratObject.list<-SeuratObject.list[order(names(SeuratObject.list))]

#Change sample id by library name. 
names(SeuratObject.list)<- c(list.name[[1]],list.name[[2]],list.name[[3]],list.name[[4]],list.name[[5]],list.name[[6]],list.name[[7]],list.name[[8]],list.name[[9]],list.name[[10]],list.name[[11]])

4 Quality control

Quality control metrics are collected to determine library complexity, signal to noise ratios, fragment length distribution per replicate (where available), and reproducibility.

Here we will create a list called ’all_data` which has all the quality control of each Seurat object of each samples.

Look at the distributions before deciding on cutoffs.

all_data = lapply(SeuratObject.list, QC)

4.1 Create a data frame of the metadata

We will creat a data frame with only the metadata of each sample which will be called metada_df.

df<-lapply(all_data, make.metadata.df)
##create a data frame of all metadata 
metadata_df <- map_df(df, ~as.data.frame(.x), .id="id")
metadata_df<-metadata_df[order(metadata_df$id),]

write.csv(metadata_df, "../results/tables/metadata_df_qc.csv", row.names=TRUE, quote=FALSE) 

4.1.1 Table with the initial number of cell of each sample

num_cell<- as.data.frame(ddply(metadata_df, .(id), nrow))

#change column names
colnames(num_cell) <- c("library_name","initial_cells")

One important step is to know of the exactly number of initial cells we have in each sample to know the percentage of cell we are going to keep after the filtering process.

Table 1: Table of total number of initial barcoded cell of each library
Total Number
library_name initial_cells
BCLL_14_T_1 6919
BCLL_14_T_2 6103
BCLL_15_T_1 5788
BCLL_15_T_2 5845
BCLL_2_T_1 10837
BCLL_2_T_2 7910
BCLL_2_T_3 7460
BCLL_8_T_1 7181
BCLL_8_T_2 7861
BCLL_9_T_1 5572
BCLL_9_T_2 5530

4.2 scATAC parameters

4.2.1 Number of detected peaks

ggviolin(
metadata_df,
  x="id",
  y="nFeature_ATAC",
  fill="steelblue",
  add="boxplot", 
  title = "Number of detected peaks",
  ggtheme = theme_pubr(x.text.angle = 20),
  add.params = list(fill = "white"))+
  geom_hline(yintercept = 18000, linetype='dashed', col = 'red')+
  labs(x = "Library name", y = "Number of detected Peaks")

We can easily see that the sample BCLL_2 which belong to the adult sample have a sifnifcant different of number of peak distribution. This difference could low the general average of number of peaks.

  • The median number of peaks is: 3331

  • The mean number of peaks is: 4653.39

The median number of peaks per library

aggregate(nFeature_ATAC ~ id, data = metadata_df, median)
            id nFeature_ATAC
1  BCLL_14_T_1        5009.0
2  BCLL_14_T_2        5001.0
3  BCLL_15_T_1        4264.0
4  BCLL_15_T_2        4160.0
5   BCLL_2_T_1        1611.0
6   BCLL_2_T_2        1598.5
7   BCLL_2_T_3        2223.0
8   BCLL_8_T_1        4656.0
9   BCLL_8_T_2        4355.0
10  BCLL_9_T_1        5552.0
11  BCLL_9_T_2        5461.5

In the table above shows clearly that the median of peaks of BCLL2 libraries are significantly different that the rest. Their median are almost 4 time lower than the other ones.

4.3 Nucleosome Banding pattern

Nucleosome banding pattern: The histogram of DNA fragment sizes (determined from the paired-end sequencing reads) should exhibit a strong nucleosome banding pattern corresponding to the length of DNA wrapped around a single nucleosome. We calculate this per single cell, and quantify the approximate ratio of Di-nucleosomal(DI) and mono-nucleosomal(MONO) to nucleosome-free(NFR) fragments.

Single Cell ATAC read pairs produce detailed information about nucleosome packing and positioning. The fragment length distribution captures the nucleosome positioning periodicity.

Histogram are divided by NFR, MonoNR and DiNR. Dashed red lines represent the thresholds we set according our observation which is base on the lowest parts of the histogram line. Back straight lines are threshold (147 and 294) base on bibliography, length of DNA wrapped around a single nucleosome. Ratio of fragments of each part made by both bibliography and our own threshold are represented in back and in red respectively.

The plot can be used to evaluate the quality of transposase reaction. We expect to find half or more of the of fragments within the nucleosome free regions (NFR) to confirm that our data is high quality and the transposase worked properly.

Biobliography information about the minimum and maximum thresolds. Nucleosome signal. The length of DNA wrapped around a single nucleosome has been experimentally determined as 147 bp. As Tn5 has a strong preference to integrate into nucleosome-free DNA, successful ATAC-seq experiments typically exhibit a depletion of DNA fragments with lengths that are multiples of 147 bp. We defined the nucleosome signal QC metric in Signac as the ratio of mononucleosomal (147–294 bp) to nucleosome-free (<147 bp) fragments sequenced for the cell, as a way of quantifying the expected depletion of nucleosome-length DNA fragments.


nucleosome.bp <- function(seurat_object_df) {
  
  for (i in seq_along(seurat_object_df)){
    
  DefaultAssay(seurat_object_df[[i]]) <- "ATAC"
  fh <- FragmentHistogram(object = seurat_object_df[[i]])
  
  
  min.threshold<-147
  max.threshold<-294
  my.min.threshold <- 127
  my.max.threshold <- 274
  
  NFR<- (length(which(fh$data$length < min.threshold))/ nrow(fh$data)) *100
  MonoNR<- (length(which(fh$data$length > min.threshold &fh$data$length < max.threshold ))/ nrow(fh$data)) *100
  DiNR<- (length(which(fh$data$length > max.threshold))/ nrow(fh$data)) *100
  my.NFR<- (length(which(fh$data$length < my.min.threshold))/ nrow(fh$data)) *100
  my.MonoNR<- (length(which(fh$data$length > my.min.threshold & fh$data$length < my.max.threshold ))/ nrow(fh$data)) *100
  my.DiNR<- (length(which(fh$data$length > my.max.threshold))/ nrow(fh$data)) *100

  p <- ggplot(fh$data, aes(length)) + 
    ggtitle(paste0("Nucleosome banding pattern of ",unique(names(seurat_object_df[i])))) +
    geom_histogram(binwidth = 1, fill = "steelblue") +
    geom_density(aes(y = ..count..), bw = 1, alpha = 0, col = "black", lwd = 1) + scale_x_continuous(limits = c(0, 550)) +
    geom_vline(xintercept = c(my.min.threshold, my.max.threshold),linetype="dashed",colour = "red") +
    geom_vline(xintercept = c(min.threshold, max.threshold)) +
    theme_minimal()+
    geom_text(x = 80, y = 50, label = paste("NFR",round(NFR, 2)), size = 3) +
    geom_text(x = 200, y = 50, label = paste("MONO",round(MonoNR, 2)), size = 3) +
    geom_text(x = 350, y = 50, label = paste("DI",round(DiNR, 2)), size = 3)+
    geom_text(x = 80, y = 100, label = round(my.NFR, 2), size = 3, color="red") +
    geom_text(x = 200, y = 100, label = round(my.MonoNR, 2), size = 3, color="red") +
    geom_text(x = 350, y = 100, label = round(my.DiNR, 2), size = 3, color="red")
  
  print(p)
}
}

t<-nucleosome.bp(all_data)

Insert size distributions of the aggregated single cells from all eleven samples exhibited clear nucleosoma banding patterns.

ns <- ggviolin(metadata_df,
  x = "id", fill = "steelblue", x.text.angle = 45,
  y = "nucleosome_signal",
  title = "Nucleosome signal distribution(log10)",
) + scale_y_log10() + geom_hline(yintercept = nucleosome_signal_atac, linetype='dashed', col = 'red')
ns + labs(x = "Library name", y = "Nucleosome signal")
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 8 rows containing non-finite values (stat_ydensity).

## TSS enrichment

Transcriptional start site (TSS) enrichment score. The ENCODE project has defined an ATAC-seq targeting score based on the ratio of fragments centered at the TSS to fragments in TSS-flanking regions. Poor ATAC-seq experiments typically will have a low TSS enrichment score.

TSS scores = the depth of TSS (each 100bp window within 1000 bp each side) / the depth of end flanks (100bp each end).

TSSE score = max(mean(TSS score in each window))

To plot TSS enrichment profiles, we use the TSSPlot() function. TSS enrichment profiles show a clear peak in the center and a smaller shoulder peak at the downstream of the TSS (TSS + 220) could be the spacing region between two flanking nucleosomes. Two vertical back dashed lines are given at the consensus TSS (hg38 genome) and at TSS + 220 bp.

We can see clearly that the reads concentrate around the TSS, with a prominent peak a bit upstream


tss<-tss.enrich.plot(all_data)

Now, we are going to show the distribution of the TSS scores for each samples. Red dashes line represents the cut off used to slip the data by low and high TSS score.

ls<- ggviolin(metadata_df,
              x = "id", fill = "steelblue", x.text.angle = 25,y = "TSS.enrichment")  +
  geom_hline(yintercept = 2, linetype='dashed', col = 'red')+ 
  labs(x = "Library name", y = "TSS enrichment score")

ls1<- ls + scale_y_log10()+ ggtitle("TSS enrichment score (log10)")

ls + ggtitle("TSS enrichment score")

ls1

4.3.1 Library sizes

The library size of ATAC-seq make reference to the number of reads

ls1<- ggboxplot(metadata_df,
              x = "id", fill = "steelblue", x.text.angle = 25,
  y = "nCount_ATAC", title = "Library size (log10)") + 
  scale_y_log10() +   
  geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'red') +
  labs(x = "Library name", y = "Number of counts ATAC-seq")

ls1

ls<- ggviolin(metadata_df,
              x = "id", fill = "steelblue", x.text.angle = 25,
  y = "nCount_ATAC", title = "Library size (log10)" ,add="boxplot", add.params = list(fill = "white")) + 
  scale_y_log10() +   
  geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'red')+
  labs(x = "Library name", y = "Number of counts ATAC-seq")

ls 


lib_size_hist_log <- metadata_df  %>%
  ggplot(aes_string("nCount_ATAC")) +
  geom_histogram(bins = 100) +
  labs(x = "Library Size (log10)", y = "Number of Cells")+
  theme_pubr()+
  scale_x_log10() +
  geom_vline(xintercept = c(min_lib_size_atac,max_lib_size_atac), linetype = "dashed", color = "red")

lib_size_hist <- lib_size_hist_log +
    scale_x_continuous(limits = c(0, 5000)) +
    xlab("Library Size") +
    theme_pubr()

lib_size_hist_log

lib_size_hist

4.4 RNA-seq parameters

4.4.1 Library sizes

Lower and the upper thresholds that we have set are shown as red dashed lines.

Here we can see the distribution of the library size of RNAm each barcoded cell from each sample.


ls<- ggviolin(metadata_df,
              x = "id", fill = "steelblue", x.text.angle = 25,
  y = "nCount_RNA", title = "Library size of RNA-seq (log10)" ,add="boxplot", add.params = list(fill = "white")) + 
  scale_y_log10() +   
  geom_hline(yintercept = c(min_lib_size_rna,max_lib_size_rna), linetype='dashed', col = 'red')+
  labs(x = "Library name", y = "Number of counts RNA-seq")

ls 


lib_size_hist_log <- metadata_df  %>%
  ggplot(aes_string("nCount_RNA")) +
  geom_histogram(bins = 100) +
  labs(x = "Library Size (total UMI)(log10)", y = "Number of Cells")+
  theme_pubr()+
  scale_x_log10() +
  geom_vline(xintercept = c(min_lib_size_rna,max_lib_size_rna), linetype = "dashed", color = "red")

lib_size_hist <- lib_size_hist_log +
    scale_x_continuous(limits = c(0, 3000)) +
    xlab("Library Size (total UMI)") +
    theme_pubr()

lib_size_hist_log + lib_size_hist

4.4.2 Number of features (detected genes)

Number of detected genes are also the library complexity.

High number of detected genes may be an indication of duplicate/multiple cells. But can also be a larger celltype.

We expect to fin a large library complexity in a large library size. If we have a low library complexity ( low detected genes) in a large library size, it means that there a lot of duplicate and copie fragments.

We have set a lower threshold in order to remove cells with too few genes.

metadata_df$has_high_lib_size <- 
  metadata_df$nCount_RNA > min_lib_size_rna &
  metadata_df$nFeature_RNA > min_ngenes_rna

We use log10 scale to have a better visualization of the distribution data.

ls<- ggviolin(metadata_df,
              x = "id", 
              fill = "steelblue", 
              x.text.angle = 25,
              y = "nFeature_RNA", 
              title = "Number of detected genes (log10)",
              add="boxplot", 
              add.params = list(fill = "white")) + 
  geom_hline(yintercept = c(min_ngenes_rna,max_ngenes_rna), linetype='dashed', col = 'red')+
  scale_y_log10() 

ls 

We want to delete the fist peak which represent technical issues and we are not interesting on keeping that because it will bias our results.

ngenes_hist <- metadata_df %>%
  ggplot(aes_string("nFeature_RNA")) +
  geom_histogram(bins = 100) +
  labs(x = "Number of Detected Genes", y = "Number of Cells") +
  theme_pubr()+
  geom_vline(xintercept = min_ngenes_rna, linetype = "dashed", color = "red") +
  geom_vline(xintercept = max_ngenes_rna, linetype = "dashed", color = "red")


ngenes_hist

  • The median of total number of genes is: 1764

  • The median number of genes per library:

aggregate(nFeature_RNA ~ id, data = metadata_df, median)
            id nFeature_RNA
1  BCLL_14_T_1         1736
2  BCLL_14_T_2         1659
3  BCLL_15_T_1         1713
4  BCLL_15_T_2         1895
5   BCLL_2_T_1         1291
6   BCLL_2_T_2         1584
7   BCLL_2_T_3         1586
8   BCLL_8_T_1         2318
9   BCLL_8_T_2         2117
10  BCLL_9_T_1         1997
11  BCLL_9_T_2         2142

We can see that BCLL2 samples on average have lower number of detected genes than majority of samples.

4.4.3 Tecnical issues cells

tt<-lapply(all_data, tecnical.cell)
df.ti<-lapply(tt, make.metadata.df)
metadata_df.ttt <- map_df(df.ti, ~as.data.frame(.x), .id="id")
metadata_df.ttt<-metadata_df.ttt[order(metadata_df.ttt$id),]

ngenes_hist <- metadata_df.ttt %>%
  ggplot(aes_string("nFeature_RNA")) +
  geom_histogram(bins = 100) +
  labs(x = "Number of Detected Genes", y = "Number of Cells") +
  theme_pubr()+
  geom_vline(xintercept = min_ngenes_rna, linetype = "dashed", color = "red") 
    


ngenes_hist1<- ngenes_hist + geom_vline(xintercept = max_ngenes_rna, linetype = "dashed", color = "red")
ngenes_hist1 + ngenes_hist

4.4.4 Correlation library size vs library complexity (detected genes)

Here we can see that there is a corelation between the library size and the library complexity in each sample. Tha means that we have a high variability of genes. It can suggest that more average number of read more genes we find.

corr_lib_size_lib_comp<-ggscatter(metadata_df,y="nFeature_RNA", x="nCount_RNA",
          color="blue",
          add="reg.line", 
          conf.int = T,
          cor.coef = T,
          ylim=c(0,20000),
          title = "Correlation between lib. size and lib. complexity",
          cor.coeff.args = list(method = "pearson", label.x = 10000, label.sep = "\n"))

corr_lib_size_lib_comp1<-ggscatter(metadata_df,y="nFeature_RNA", x="nCount_RNA",
          color="id",
          add="reg.line", 
          conf.int = T,
          cor.coef = T,
          ylim=c(0,20000),
          title = "Correlation between lib. size and lib. complexity",
          cor.coeff.args = list(method = "pearson", label.x = 10000, label.sep = "\n"))
corr_lib_size_lib_comp + labs(y = "Number of Detected Genes", x = "Library size")

corr_lib_size_lib_comp1 + labs(y = "Number of Detected Genes", x = "Library size")

4.5 Mitocondrial RNA expression fraction

Suggested that when the cell membrane is broken, cytoplasmic RNA will be lost, but not RNAs enclosed in the mitochondria. High content of mitochondrial RNA may indicate apoptosis.

pct_mit_hist <- metadata_df %>%
  ggplot(aes_string("percent.mt")) +
    geom_histogram(bins = 100) +
    labs(x = "% Mitochondrial Expression", y = "Number of Cells") +
    theme_pubr()+
  geom_vline(xintercept = max_percent_mit, linetype = "dashed", color = "red") +
  scale_x_continuous(limits = c(0, 100))

pct_mit_hist

corr_lib_size_lib_comp.ttt<-ggscatter(metadata_df.ttt,y="percent.mt", x="nFeature_RNA",
          color="blue",
          add="reg.line", 
          conf.int = T,
          cor.coef = T,
          ylim=c(0,100),
          title = "Correlation between number of detected genes vs % of RNA mitochondrial",
          cor.coeff.args = list(method = "pearson", label.x = 30,label.y = 90, label.sep = "\n"))


corr_lib_size_lib_comp.ttt + labs(y = "% of RNA mitocondrial", x = "Features")

4.6 Ribosomal RNA read fraction

  • Ribosomal protein read fraction

Possible that degradation of RNA leads to more templating of rRNA-fragments. Proportion ribosomal proteins may be an artifact from handling of samples for cells of the same celltype.


pct_ribo_hist <- metadata_df %>%
  ggplot(aes_string("percent_ribo")) +
    geom_histogram(bins = 100) +
    labs(x = "% Ribosomal Expression", y = "Number of Cells") +
    theme_pubr()+
  scale_x_continuous(limits = c(0, 100))

pct_ribo_hist

PCA (shall we do it?)

Examine PCA/tSNE before/after filtering and make a judgment on whether to remove more/less cells. (Scater tutorial)

Check for batch effects in PCA

5 Filtering

After calculating the quality control metrics for both ATAC and RNA assay we are going to remove cells that are outliers for these metrics which will be save in a seurt oBject list called filtered.cell

filtered.cell<- lapply(all_data, filtering.cell)
#data frame of the filtered cells samples
filtered.cell.df<-lapply(filtered.cell, make.metadata.df)
filtered.cell.df <- map_df(filtered.cell.df, ~as.data.frame(.x), .id="id")
filtered.cell.df<-filtered.cell.df[order(filtered.cell.df$id),]

Data frame of the number of cell with the metrics.

num_fil_cell<- as.data.frame(ddply(filtered.cell.df, .(id), nrow))

#change column names
colnames(num_fil_cell) <- c("library_name","filt_cells")

5.1 Filtered out cells

We are going to collect all that cells that are outliers of the QC metrics and will be save in a list of seurat object called `filtered.out.cells.

#Colleting cells are out of QC thresholds

filtered.out.cells<-lapply(all_data, filter.out.cell)

Create a data frame with the metadata of the filtered out cells.

#data frame of the filtered out cells metadata.
filtered.out.cell.df<-lapply(filtered.out.cells, make.metadata.df)
filtered.out.cell.df <- map_df(filtered.out.cell.df, ~as.data.frame(.x), .id="id")
filtered.out.cell.df<-filtered.out.cell.df[order(filtered.out.cell.df$id),]

#Data frame of number of cells that were out of he QC metrics
num_filOut_cell<- as.data.frame(ddply(filtered.out.cell.df, .(id), nrow))
colnames(num_filOut_cell) <- c("library_name","filt_out_cells")

Merge the initial number of cell, the filtered cell and the filtered out cells data frames

ini_filt_df<-merge(num_cell, num_fil_cell,by = "library_name") 
ini_filt_df<-merge(ini_filt_df,num_filOut_cell, by = "library_name") 
Table 2: Table with the total number of initial and filtered cell
Total Number
library_name initial_cells filt_cells filt_out_cells
BCLL_14_T_1 6919 6622 297
BCLL_14_T_2 6103 5839 264
BCLL_15_T_1 5788 5511 277
BCLL_15_T_2 5845 5532 313
BCLL_2_T_1 10837 9255 1582
BCLL_2_T_2 7910 6732 1178
BCLL_2_T_3 7460 6511 949
BCLL_8_T_1 7181 6883 298
BCLL_8_T_2 7861 7506 355
BCLL_9_T_1 5572 5071 501
BCLL_9_T_2 5530 5094 436

Difference between initial number of cell and filtered number of cell

#create a column with the number of deleted cells which should be the same than the filtered out cell. 
ini_filt_df$del_cells <- (ini_filt_df$initial_cells - ini_filt_df$filt_cells)

Percentages of the deleted and QC pass-filter cells

ini_filt_df$pct_keep_cells <-  round(((ini_filt_df$filt_cells/ini_filt_df$initial_cells)*100),2)
ini_filt_df$pct_del_cells <-  round((100-ini_filt_df$pct_keep_cells),2)

Meaning of each column:

  • pct_del_cells= percentage of deleted cells

  • pct_keep_cells= percentage of keep cells

  • initial_cells= total number of initial cells

  • filt_cells= total number of filtered cells

  • filt_out_cells= total number of filtered out cells (it should match with del_cell column)

  • del_cell= total number of deleted cells

Table 3: Table of total number of initial, filtered and filtered cell and the porcentaje of deleted and non-deleted cells of library
Total Number
Percentage %
library_name initial_cells filt_cells filt_out_cells del_cells pct_keep_cells pct_del_cells
BCLL_14_T_1 6919 6622 297 297 95.71 4.29
BCLL_14_T_2 6103 5839 264 264 95.67 4.33
BCLL_15_T_1 5788 5511 277 277 95.21 4.79
BCLL_15_T_2 5845 5532 313 313 94.64 5.36
BCLL_2_T_1 10837 9255 1582 1582 85.40 14.60
BCLL_2_T_2 7910 6732 1178 1178 85.11 14.89
BCLL_2_T_3 7460 6511 949 949 87.28 12.72
BCLL_8_T_1 7181 6883 298 298 95.85 4.15
BCLL_8_T_2 7861 7506 355 355 95.48 4.52
BCLL_9_T_1 5572 5071 501 501 91.01 8.99
BCLL_9_T_2 5530 5094 436 436 92.12 7.88

Mean of percentage of cell that pass the QC metrics is: 92.13 %

We are deleting on average 7.87 % of bad quality cells.

pct_mit_hist <- filtered.out.cell.df %>%
  ggplot(aes_string("percent.mt")) +
    geom_histogram(bins = 100) +
    labs(x = "% Mitochondrial Expression", y = "Number of Cells") +
    theme_pubr()+
  geom_vline(xintercept = max_percent_mit, linetype = "dashed", color = "red") +
  scale_x_continuous(limits = c(0, 100))

pct_mit_hist

6 Save list of seurat objects and filtered and filtered out seurat objects.

saveRDS(all_data,"../results/R_objects/1.tonsil_multiome_QC.rds")
saveRDS(filtered.cell,"../results/R_objects/2.tonsil_multiome_filtered.rds")
saveRDS(filtered.out.cells,"../results/R_objects/3.tonsil_multiome_filtered_Out.rds")
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib

locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scater_1.22.0               scuttle_1.4.0              
 [3] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
 [5] MatrixGenerics_1.6.0        matrixStats_0.61.0         
 [7] magick_2.7.3                RColorBrewer_1.1-2         
 [9] hdf5r_1.3.5                 EnsDb.Hsapiens.v86_2.99.0  
[11] ensembldb_2.18.2            AnnotationFilter_1.18.0    
[13] GenomicFeatures_1.46.1      AnnotationDbi_1.56.2       
[15] Biobase_2.54.0              GenomicRanges_1.46.1       
[17] GenomeInfoDb_1.30.0         IRanges_2.28.0             
[19] S4Vectors_0.32.3            BiocGenerics_0.40.0        
[21] data.table_1.14.2           reshape2_1.4.4             
[23] plyr_1.8.6                  forcats_0.5.1              
[25] stringr_1.4.0               dplyr_1.0.7                
[27] purrr_0.3.4                 readr_2.1.1                
[29] tidyr_1.1.4                 tibble_3.1.6               
[31] tidyverse_1.3.1             ggpubr_0.4.0               
[33] ggplot2_3.3.5               SeuratObject_4.0.4         
[35] Seurat_4.0.6                Signac_1.5.0               
[37] usethis_2.1.5               kableExtra_1.3.4           
[39] knitr_1.36                  BiocStyle_2.22.0           

loaded via a namespace (and not attached):
  [1] rappdirs_0.3.3            SnowballC_0.7.0          
  [3] rtracklayer_1.54.0        scattermore_0.7          
  [5] bit64_4.0.5               irlba_2.3.5              
  [7] DelayedArray_0.20.0       rpart_4.1-15             
  [9] KEGGREST_1.34.0           RCurl_1.98-1.5           
 [11] generics_0.1.1            ScaledMatrix_1.2.0       
 [13] cowplot_1.1.1             RSQLite_2.2.9            
 [15] RANN_2.6.1                future_1.23.0            
 [17] bit_4.0.4                 tzdb_0.2.0               
 [19] spatstat.data_2.1-2       webshot_0.5.2            
 [21] xml2_1.3.3                lubridate_1.8.0          
 [23] httpuv_1.6.4              assertthat_0.2.1         
 [25] viridis_0.6.2             xfun_0.29                
 [27] hms_1.1.1                 jquerylib_0.1.4          
 [29] evaluate_0.14             promises_1.2.0.1         
 [31] fansi_0.5.0               restfulr_0.0.13          
 [33] progress_1.2.2            dbplyr_2.1.1             
 [35] readxl_1.3.1              igraph_1.2.10            
 [37] DBI_1.1.1                 htmlwidgets_1.5.4        
 [39] sparsesvd_0.2             spatstat.geom_2.3-1      
 [41] ellipsis_0.3.2            backports_1.4.1          
 [43] bookdown_0.24             sparseMatrixStats_1.6.0  
 [45] biomaRt_2.50.1            deldir_1.0-6             
 [47] vctrs_0.3.8               ROCR_1.0-11              
 [49] abind_1.4-5               cachem_1.0.6             
 [51] withr_2.4.3               ggforce_0.3.3            
 [53] BSgenome_1.62.0           checkmate_2.0.0          
 [55] sctransform_0.3.2         GenomicAlignments_1.30.0 
 [57] prettyunits_1.1.1         goftest_1.2-3            
 [59] svglite_2.0.0             cluster_2.1.2            
 [61] lazyeval_0.2.2            crayon_1.4.2             
 [63] labeling_0.4.2            pkgconfig_2.0.3          
 [65] slam_0.1-49               tweenr_1.0.2             
 [67] vipor_0.4.5               nlme_3.1-153             
 [69] ProtGenerics_1.26.0       nnet_7.3-16              
 [71] rlang_0.4.12              globals_0.14.0           
 [73] lifecycle_1.0.1           miniUI_0.1.1.1           
 [75] filelock_1.0.2            BiocFileCache_2.2.0      
 [77] rsvd_1.0.5                modelr_0.1.8             
 [79] dichromat_2.0-0           cellranger_1.1.0         
 [81] polyclip_1.10-0           lmtest_0.9-39            
 [83] Matrix_1.3-4              ggseqlogo_0.1            
 [85] carData_3.0-4             zoo_1.8-9                
 [87] base64enc_0.1-3           beeswarm_0.4.0           
 [89] reprex_2.0.1              ggridges_0.5.3           
 [91] png_0.1-7                 viridisLite_0.4.0        
 [93] rjson_0.2.20              bitops_1.0-7             
 [95] KernSmooth_2.23-20        Biostrings_2.62.0        
 [97] blob_1.2.2                DelayedMatrixStats_1.16.0
 [99] parallelly_1.30.0         jpeg_0.1-9               
[101] rstatix_0.7.0             ggsignif_0.6.3           
[103] beachmat_2.10.0           scales_1.1.1             
[105] memoise_2.0.1             magrittr_2.0.1           
[107] ica_1.0-2                 zlibbioc_1.40.0          
[109] compiler_4.1.2            BiocIO_1.4.0             
[111] fitdistrplus_1.1-6        Rsamtools_2.10.0         
[113] cli_3.1.0                 XVector_0.34.0           
[115] listenv_0.8.0             patchwork_1.1.1          
[117] pbapply_1.5-0             htmlTable_2.3.0          
[119] Formula_1.2-4             MASS_7.3-54              
[121] mgcv_1.8-38               tidyselect_1.1.1         
[123] stringi_1.7.6             highr_0.9                
[125] yaml_2.2.1                BiocSingular_1.10.0      
[127] latticeExtra_0.6-29       ggrepel_0.9.1            
[129] grid_4.1.2                VariantAnnotation_1.40.0 
[131] sass_0.4.0                fastmatch_1.1-3          
[133] tools_4.1.2               future.apply_1.8.1       
[135] parallel_4.1.2            rstudioapi_0.13          
[137] foreign_0.8-81            lsa_0.73.2               
[139] gridExtra_2.3             farver_2.1.0             
[141] Rtsne_0.15                digest_0.6.29            
[143] BiocManager_1.30.16       shiny_1.7.1              
[145] qlcMatrix_0.9.7           Rcpp_1.0.7               
[147] car_3.0-12                broom_0.7.10             
[149] later_1.3.0               RcppAnnoy_0.0.19         
[151] httr_1.4.2                biovizBase_1.42.0        
[153] colorspace_2.0-2          rvest_1.0.2              
[155] XML_3.99-0.8              fs_1.5.2                 
[157] tensor_1.5                reticulate_1.22          
[159] splines_4.1.2             uwot_0.1.11              
[161] RcppRoll_0.3.0            spatstat.utils_2.3-0     
[163] plotly_4.10.0             systemfonts_1.0.3        
[165] xtable_1.8-4              jsonlite_1.7.2           
[167] R6_2.5.1                  Hmisc_4.6-0              
[169] pillar_1.6.4              htmltools_0.5.2          
[171] mime_0.12                 glue_1.6.0               
[173] fastmap_1.1.0             BiocParallel_1.28.3      
[175] BiocNeighbors_1.12.0      codetools_0.2-18         
[177] utf8_1.2.2                lattice_0.20-45          
[179] bslib_0.3.1               spatstat.sparse_2.1-0    
[181] ggbeeswarm_0.6.0          curl_4.3.2               
[183] leiden_0.3.9              survival_3.2-13          
[185] rmarkdown_2.11            docopt_0.7.1             
[187] munsell_0.5.0             GenomeInfoDbData_1.2.7   
[189] haven_2.4.3               gtable_0.3.0             
[191] spatstat.core_2.3-2